# code/stage3_macro.py

import pandas as pd
import numpy as np
from paths import MACRO_FILES, PROCESSED_DATA_DIR

# 1. V-Dem data (libdem and polyarchy indices)
# Raw V-Dem column name -> name used in the combined macro table.
_vdem_renames = {
    'country_text_id': 'C_ALPHAN',
    'year': 'YEAR',
    'v2x_libdem': 'libdem_index',
    'v2x_polyarchy': 'polyarchy_index',
}
vdem = (
    pd.read_csv(MACRO_FILES.vdem, usecols=list(_vdem_renames))
    .rename(columns=_vdem_renames)
    # Cast the merge keys explicitly: country code as str, year as int,
    # matching the casts applied to the other macro sources in this file.
    .astype({'C_ALPHAN': str, 'YEAR': int})
)

# 2. WDI GDP per capita (PPP), kept only as its natural log
wdi = (
    pd.read_csv(
        MACRO_FILES.wdi,
        usecols=['C_ALPHAN', 'Year', 'GDP_per_capita_PPP'],
    )
    .rename(columns={'Year': 'YEAR'})
    # Merge keys: country code as str, year as int.
    .astype({'C_ALPHAN': str, 'YEAR': int})
    # Zeros are turned into NaN before the log so they yield NaN, not -inf.
    .assign(
        log_gdp_ppp=lambda frame: np.log(
            frame['GDP_per_capita_PPP'].replace(0, np.nan)
        )
    )
    # Drop the raw GDP column; only the keys and the log value go downstream.
    .loc[:, ['C_ALPHAN', 'YEAR', 'log_gdp_ppp']]
)

# 3. GII data
gii = (
    pd.read_csv(MACRO_FILES.gii, usecols=['C_ALPHAN', 'Year', 'GII'])
    # Align column names with the other macro sources.
    .rename(columns={'Year': 'YEAR', 'GII': 'gii_index'})
    # Merge keys: country code as str, year as int.
    .astype({'C_ALPHAN': str, 'YEAR': int})
)

# 4. Merge all macro sources into one table keyed by (C_ALPHAN, YEAR)
merge_keys = ['C_ALPHAN', 'YEAR']

# Keep one row per key in every source BEFORE merging (first occurrence wins).
# Deduplicating only after the merges would let a key that is duplicated in
# several sources multiply into cross-product rows across successive merges,
# inflating the intermediate frames before the extras are thrown away.
macro_dfs = [df.drop_duplicates(subset=merge_keys) for df in (vdem, wdi, gii)]

final_macro = macro_dfs[0]
for df in macro_dfs[1:]:
    # Outer join: a key present in any source is kept, and the columns of
    # sources that lack it are filled with NaN. `validate` makes pandas raise
    # MergeError if the one-row-per-key invariant established above is broken.
    final_macro = pd.merge(
        final_macro,
        df,
        on=merge_keys,
        how='outer',
        validate='one_to_one',
    )

# Keys are unique by construction here, so no post-merge deduplication is needed.
final_macro.to_parquet(PROCESSED_DATA_DIR / 'final_macro_combined_all.parquet', index=False)
